In [1]:
# Selenium browser imports
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
from selenium.common.exceptions import NoSuchElementException

# All other imports
import pandas as pd
import numpy as np
import requests
import lxml.html as lx
from lxml import etree
from bs4 import BeautifulSoup
import re
import time
import csv
import numpy as np
import pandas as pd
In [2]:
# Download the FCC's FIPS reference file (states + counties) as plain text
fips_response = requests.get('https://transition.fcc.gov/oet/info/maps/census/fips/fips.txt')
fips_data = fips_response.text
In [3]:
# Parse the FIPS text file.  Data rows are indented by at least three
# spaces; the first 11 characters hold the code, the rest hold the name.
fip_code = []
names = []
for line in fips_data.split('\n'):
    if line.startswith('   '):
        fip_code.append(line[:11].strip())
        names.append(line[11:].strip())
In [4]:
# Drop the " County" suffix from every name so names match other sources
new_names = [name.replace(' County', '') for name in names]
In [5]:
# Rows from position 54 onward of the reference file are counties
county_name = new_names[54:]
county_code = fip_code[54:]
In [6]:
# Rows 3-53 of the reference file are the 51 states/districts (incl. DC)
state_codes = fip_code[3:54]
state_names = new_names[3:54]
In [8]:
# Build the state-level FIPS lookup table (code -> name)
fips_data_states_df = pd.DataFrame({'Code': state_codes, 'Name': state_names})
In [9]:
# 1-based position of each state; used later to pick the matching
# <option> element in the website's state dropdown
fips_data_states_df['Index'] = list(range(1, len(fips_data_states_df) + 1))
In [10]:
# The 51 state/district abbreviations (incl. DC), in the same order as the
# rows of fips_data_states_df.  Note: the original code built a tuple via an
# implicit tuple literal even though the comment (and later list handling)
# expects a list; this is now an explicit list.
state_abbrev = [
    'AL', 'AK', 'AZ', 'AR', 'CA', 'CO', 'CT', 'DE', 'DC', 'FL', 'GA',
    'HI', 'ID', 'IL', 'IN', 'IA', 'KS', 'KY', 'LA', 'ME', 'MD', 'MA',
    'MI', 'MN', 'MS', 'MO', 'MT', 'NE', 'NV', 'NH', 'NJ', 'NM', 'NY',
    'NC', 'ND', 'OH', 'OK', 'OR', 'PA', 'RI', 'SC', 'SD', 'TN', 'TX',
    'UT', 'VT', 'VA', 'WA', 'WV', 'WI', 'WY',
]
In [11]:
# Attach the abbreviations to the state table (same row order assumed)
fips_data_states_df = fips_data_states_df.assign(**{'State Abbrev': state_abbrev})
In [12]:
# Build the county-level FIPS lookup table
fips_data_county_df = pd.DataFrame({'Code': county_code, 'Name': county_name})

# The first two digits of a county FIPS code identify its state
fips_data_county_df['State Code'] = fips_data_county_df['Code'].str.slice(0, 2)

# Upper-case names so they match the formatting of the scraped report
fips_data_county_df['Name'] = fips_data_county_df['Name'].str.upper()
In [14]:
# Create a function to extract data for each state
def scrape_train_accidents(fips_data_states_df):
    """Scrape per-county train accident totals from the FRA safety site.

    Parameters
    ----------
    fips_data_states_df : pd.DataFrame
        State lookup table with a 'Code' column (state FIPS code) and an
        'Index' column (1-based position of the state in the site's
        dropdown).

    Returns
    -------
    list
        Three parallel lists: [county_names, accs, state_code].  States
        whose report pages are broken are skipped, so the lists may cover
        fewer than all states.  (Previously the function raised NameError
        on return when no state produced a table, because the returned
        variable was only assigned inside the try block.)
    """
    url = "https://safetydata.fra.dot.gov/OfficeofSafety/publicsite/query/inctmap.aspx"

    # Launch a visible (non-headless) Chrome session
    chrome_options = Options()
    chrome_options.headless = False
    browser = webdriver.Chrome('./chromedriver', options=chrome_options)
    browser.implicitly_wait(10)

    # Parallel accumulators across all states
    county_names = []
    accs = []
    state_code = []

    try:
        for i in range(len(fips_data_states_df)):
            print(i)  # progress indicator
            browser.implicitly_wait(10)
            browser.get(url)

            # Select the state in the dropdown (option index = 'Index' column)
            try:
                time.sleep(0.5)
                selector = fips_data_states_df['Index'][i]
                select_state = ("/html/body/form/div[9]/table/tbody/tr[3]/td[2]/select/option[%i]" % selector)
                state_dropdown = browser.find_element("xpath", select_state)
                WebDriverWait(browser, 10).until(EC.presence_of_element_located((By.XPATH, select_state)))
                state_dropdown.click()
                browser.implicitly_wait(10)
            except NoSuchElementException:
                print("Select_state NA")
                continue

            # Submit the form to generate this state's report
            time.sleep(0.5)
            generate_report = browser.find_element("xpath", "/html/body/form/div[9]/table/tbody/tr[8]/td/input[1]")
            WebDriverWait(browser, 10).until(EC.presence_of_element_located((By.XPATH, "/html/body/form/div[9]/table/tbody/tr[8]/td/input[1]")))
            generate_report.click()
            browser.implicitly_wait(10)

            # Pull the results table; some states' report pages never load,
            # so a timeout just skips that state.
            try:
                time.sleep(10)
                table_xpath = '//*[@id="ContentPlaceHolder1_divSAS"]/div/div/div/table'
                WebDriverWait(browser, 10).until(EC.presence_of_element_located((By.XPATH, table_xpath)))
                table_df = pd.read_html(browser.find_element("xpath", table_xpath).get_attribute('outerHTML'))[0]

                # County names and accident totals from the report's
                # multi-level-header table
                counties = list(table_df['County']['County'])
                county_names.extend(counties)
                accs.extend(list(table_df['Totals']['Accs']))

                # Tag every county row with its state's FIPS code
                state_code.extend([fips_data_states_df['Code'][i]] * len(counties))
            except TimeoutException:
                print("Table_df NA")
                continue
    finally:
        # Always close the browser, even if the scrape raised part-way
        browser.quit()

    # Return whatever was collected, even if some (or all) states failed
    return [county_names, accs, state_code]
In [41]:
# Run the scraper and store the result: three parallel lists,
# [county_names, accident_totals, state_fips_codes]
output = scrape_train_accidents(fips_data_states_df)
In [16]:
# Persist the scraped output so the scraper doesn't have to run again
file_name = "output.csv"

with open(file_name, mode='w', newline='') as csv_file:
    # One CSV row per inner list (county names / accidents / state codes)
    csv.writer(csv_file).writerows(output)
In [30]:
# Reload the saved scraper output instead of re-running the scrape
output_df = pd.read_csv('output.csv', header=None)
output_df  # preview (rendered as the cell's last expression)
In [17]:
# Build a tidy frame from the in-memory scraper output: the three
# parallel lists become rows, so transpose to get one row per county.
output_df = pd.DataFrame(output).T
output_df.columns = ['County Names', 'Accidents', 'State Code']
In [19]:
# Look up each county's FIPS code by (state code, upper-cased county name);
# counties with no match are tagged 'NA' and dropped later.
county_code = []
for row in range(len(output_df['County Names'])):
    row_state = output_df['State Code'][row]
    row_county = output_df['County Names'][row]
    matches = fips_data_county_df.loc[
        (fips_data_county_df['State Code'] == row_state)
        & (fips_data_county_df['Name'] == str(row_county))
    ]
    if matches.empty:
        county_code.append('NA')
    else:
        # First matching row; column 0 is the county FIPS code
        county_code.append(matches.values[0][0])
In [20]:
# Attach the looked-up county FIPS codes
output_df['County Code'] = county_code

# Keep only rows whose county code was actually found
output_df = output_df[output_df['County Code'] != 'NA']
In [22]:
# Pull the state abbreviations out of the lookup table as a plain list
state_abbrev = fips_data_states_df['State Abbrev'].tolist()
In [23]:
# Checkpoint: pickle the cleaned per-county frame so later sessions can
# restore it without re-scraping
output_df.to_pickle("output_df.pkl")
In [6]:
# Restore the checkpointed frame.  The file written above is
# "output_df.pkl"; the previous filename "my_data.pkl" is never created
# by this notebook and raised FileNotFoundError on a fresh run.
df = pd.read_pickle("output_df.pkl")
In [24]:
# Another round of imports for plotting data
import plotly.express as px
from urllib.request import urlopen
import json
In [25]:
# Accident counts arrive as strings; cast to int for sorting and plotting
output_df['Accidents'] = output_df['Accidents'].astype(int)

# Ascending-by-accidents copy used by the county choropleth
sorted_output_df = output_df.sort_values(by='Accidents')
In [27]:
# Download a GeoJSON of US county boundaries keyed by FIPS code,
# cache it to disk, then load it for plotting
url = 'https://raw.githubusercontent.com/plotly/datasets/master/geojson-counties-fips.json'
county_geo_bytes = requests.get(url).content

with open('geojson-counties-fips.json', 'wb') as fh:
    fh.write(county_geo_bytes)

with open('geojson-counties-fips.json', 'r') as fh:
    counties_geojson = json.load(fh)
In [28]:
# Download a FIPS-to-state JSON mapping, cache it, then load it
url = 'https://gist.githubusercontent.com/wavded/1250983/raw/bf7c1c08f7b1596ca10822baeb8049d7350b0a4b/fipsToState.json'
state_geo_bytes = requests.get(url).content

with open('fipsToState.json', 'wb') as fh:
    fh.write(state_geo_bytes)

with open('fipsToState.json', 'r') as fh:
    states_geojson = json.load(fh)
In [29]:
# Total accidents per state: sum over that state's county rows
sum_by_state = output_df.groupby('State Code')['Accidents'].agg('sum')
In [30]:
# Series -> DataFrame with 'State Code' restored as a regular column
# (Series.reset_index() returns a DataFrame directly)
df_sum_by_state = sum_by_state.reset_index()
In [33]:
# Add state abbreviations to the per-state totals.
# Some states produced no report (broken site pages), so their
# abbreviations must be removed first to keep the lists aligned.
delete_index = [1, 8, 11, 18, 29, 39, 45]  # positions of the missing states

# Keep only the abbreviations whose position is not flagged for removal
state_abbrev = [abbrev for pos, abbrev in enumerate(state_abbrev)
                if pos not in delete_index]

# Attach to the totals frame
df_sum_by_state['State Abbrev'] = state_abbrev
In [34]:
# Per-state totals ordered by accident count (ascending)
sorted_sum_by_state = df_sum_by_state.sort_values(by='Accidents')
In [35]:
# Restore the original (index) order for the per-state bar chart
sorted_sum_by_index = sorted_sum_by_state.sort_index(ascending=True)
In [38]:
# State-level choropleth of total train accidents
state_map = px.choropleth(
    sorted_sum_by_state,
    locations='State Abbrev',
    color='Accidents',
    color_continuous_scale="spectral_r",
    hover_name='State Code',
    locationmode='USA-states',
    # Fix: labels keys must be column names; 'acc' is not a column, so the
    # previous labels={'acc': ...} had no effect
    labels={'Accidents': 'train accidents'},
    scope='usa',
)
state_map.show()
In [39]:
# County-level choropleth (mapbox) of train accidents, located by FIPS code
county_map = px.choropleth_mapbox(
    sorted_output_df,
    geojson=counties_geojson,
    locations='County Code',
    color='Accidents',
    color_continuous_scale="Viridis",
    range_color=(0, sorted_output_df['Accidents'].max()),
    mapbox_style="carto-positron",
    zoom=3,
    center={"lat": 37.0902, "lon": -95.7129},
    opacity=0.5,
    # Fix: labels keys must be column names; 'acc' is not a column, so the
    # previous labels={'acc': ...} had no effect
    labels={'Accidents': 'train accidents'},
)
county_map.update_layout(margin={"r": 0, "t": 0, "l": 0, "b": 0})
county_map.show()
In [40]:
# Bar chart of total train accidents per state (states in original order)
hist = px.bar(x=sorted_sum_by_index['State Abbrev'], y=sorted_sum_by_index['Accidents'], labels={'x':'States', 'y':'Train Accidents'})
hist.show()